pew %>% gather(2:7, key = "income_bracket", value = "count")
billboard %>% gather(6:81, key = "week", value = "rank")
weather %>% gather(d1:d8, key = "day", value = "temp") %>% spread(key = "element", value = "temp")
separate()ed.unite()ed.A
## # A tibble: 6 x 3 ## country year rate ## * <chr> <int> <chr> ## 1 Afghanistan 1999 745/19987071 ## 2 Afghanistan 2000 2666/20595360 ## 3 Brazil 1999 37737/172006362 ## 4 Brazil 2000 80488/174504898 ## 5 China 1999 212258/1272915272 ## 6 China 2000 213766/1280428583
A %>%
separate(rate, into = c("cases", "population"))
## # A tibble: 6 x 4 ## country year cases population ## * <chr> <int> <chr> <chr> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
A %>%
separate(rate, into = c("cases", "population"), sep = "/")
## # A tibble: 6 x 4 ## country year cases population ## * <chr> <int> <chr> <chr> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
A %>%
separate(rate, into = c("cases", "population"), convert = TRUE)
## # A tibble: 6 x 4 ## country year cases population ## * <chr> <int> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
This is a less common operation than separate() (though it is needed for Exercise 5.7). Imagine you were working with this (butchered) form of data set C.
C2 <- C %>%
separate(year, into = c("century", "year_in_century"), sep = 2)
C2
## # A tibble: 6 x 5 ## country century year_in_century cases population ## <chr> <chr> <chr> <int> <int> ## 1 Afghanistan 19 99 745 19987071 ## 2 Afghanistan 20 00 2666 20595360 ## 3 Brazil 19 99 37737 172006362 ## 4 Brazil 20 00 80488 174504898 ## 5 China 19 99 212258 1272915272 ## 6 China 20 00 213766 1280428583
We need to unite() the century and year_in_century columns.
C2 %>% unite(century, year_in_century, col = "year", sep = "")
## # A tibble: 6 x 4 ## country year cases population ## <chr> <chr> <int> <int> ## 1 Afghanistan 1999 745 19987071 ## 2 Afghanistan 2000 2666 20595360 ## 3 Brazil 1999 37737 172006362 ## 4 Brazil 2000 80488 174504898 ## 5 China 1999 212258 1272915272 ## 6 China 2000 213766 1280428583
gather() the columns into values.spread()ing the values across the columnsseparate()ed.unite()ed.Do you ever find yourself with .Rmd files that look like this?
my_df1 %>% ... # do some stuff to my_df1 ... my_df2 %>% ... # do the same stuff to my_df2 ... my_df3 %>% ... # and again to my_df3 ...
What if I want to draw the same kind of plot several times?
my_df1 %>% ggplot(aes(x = var1, y = var2, color = var3)) + geom_point() + geom_line() my_df2 %>% ggplot(aes(x = varA, y = varB, color = varC)) + geom_point() + geom_line() my_df3 %>% ggplot(aes(x = var1A, y = var2B, color = var3C)) + geom_point() + geom_line()
name_of_function <- function(data, var = "value") {
. . .
. . .
<valid R code>
. . .
. . .
return(x)
}
data, var
data is requiredvar is optional - has a default value of"value"`xlibrary(tidyverse)
my_cars <- function(mod) {
mpg %>%
filter(model == mod)
}
my_cars("protege")
## # A tibble: 0 x 11 ## # ... with 11 variables: manufacturer <chr>, model <chr>, displ <dbl>, ## # year <int>, cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>, ## # fl <chr>, class <chr>
my_cars <- function(mod = "civic") {
mpg %>%
filter(model == mod)
}
my_cars()
## # A tibble: 9 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 honda civic 1.60 1999 4 manu… f 28 33 r subc… ## 2 honda civic 1.60 1999 4 auto… f 24 32 r subc… ## 3 honda civic 1.60 1999 4 manu… f 25 32 r subc… ## 4 honda civic 1.60 1999 4 manu… f 23 29 p subc… ## 5 honda civic 1.60 1999 4 auto… f 24 32 r subc… ## 6 honda civic 1.80 2008 4 manu… f 26 34 r subc… ## 7 honda civic 1.80 2008 4 auto… f 25 36 r subc… ## 8 honda civic 1.80 2008 4 auto… f 24 36 c subc… ## 9 honda civic 2.00 2008 4 manu… f 21 29 p subc…
my_cars("jetta")
## # A tibble: 9 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 volkswagen jetta 1.90 1999 4 manu… f 33 44 d comp… ## 2 volkswagen jetta 2.00 1999 4 manu… f 21 29 r comp… ## 3 volkswagen jetta 2.00 1999 4 auto… f 19 26 r comp… ## 4 volkswagen jetta 2.00 2008 4 auto… f 22 29 p comp… ## 5 volkswagen jetta 2.00 2008 4 manu… f 21 29 p comp… ## 6 volkswagen jetta 2.50 2008 5 auto… f 21 29 r comp… ## 7 volkswagen jetta 2.50 2008 5 manu… f 21 29 r comp… ## 8 volkswagen jetta 2.80 1999 6 auto… f 16 23 r comp… ## 9 volkswagen jetta 2.80 1999 6 manu… f 17 24 r comp…
my_cars("camry") %>%
head(2)
## # A tibble: 2 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 toyota camry 2.20 1999 4 manu… f 21 29 r mids… ## 2 toyota camry 2.20 1999 4 auto… f 21 27 r mids…
my_cars(mod = "corolla") %>% head(2)
## # A tibble: 2 x 11 ## manufacturer model displ year cyl trans drv cty hwy fl class ## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr> ## 1 toyota coro… 1.80 1999 4 auto… f 24 30 r comp… ## 2 toyota coro… 1.80 1999 4 auto… f 24 33 r comp…
Pay attention to:
What does this do?
most_popular_year <- function(data, name_arg) {
data %>%
filter(name == name_arg) %>%
group_by(year) %>%
summarize(total = sum(prop)) %>%
arrange(desc(total)) %>%
head(1) %>%
select(year)
}
library(babynames) most_popular_year(data = babynames, name_arg = "Andrew")
## # A tibble: 1 x 1 ## year ## <dbl> ## 1 1987
most_popular_year(babynames, "Andrew")
## # A tibble: 1 x 1 ## year ## <dbl> ## 1 1987
# most_popular_year("Andrew")
For the following exercises, use the pnwflights14 dataset.
https://data.oregon.gov/Business/New-Businesses-Registered-Last-Month/esjy-u4fc
library(tidyverse)
biz <- read_csv("../data/New_Businesses_Registered_Last_Month.csv")
?read_csv
str(biz)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19674 obs. of 17 variables: ## $ Registry Number : int 139275697 139275697 139253991 139253991 139253991 139277594 139277594 139255491 139255491 139255491 ... ## $ Business Name : chr "A PIECE OF HEAVEN ADULT CARE HOME" "A PIECE OF HEAVEN ADULT CARE HOME" "A. RE PLUMBING LLC" "A. RE PLUMBING LLC" ... ## $ Entity Type : chr "ASSUMED BUSINESS NAME" "ASSUMED BUSINESS NAME" "DOMESTIC LIMITED LIABILITY COMPANY" "DOMESTIC LIMITED LIABILITY COMPANY" ... ## $ Registry Date : chr "01/02/2018" "01/02/2018" "01/02/2018" "01/02/2018" ... ## $ Associated Name Type : chr "AUTHORIZED REPRESENTATIVE" "PRINCIPAL PLACE OF BUSINESS" "MAILING ADDRESS" "PRINCIPAL PLACE OF BUSINESS" ... ## $ First Name : chr "OLIMPIA" NA NA NA ... ## $ Middle Name : chr "V" NA NA NA ... ## $ Last Name : chr "URSU" NA NA NA ... ## $ Suffix : chr NA NA NA NA ... ## $ Not of Record Entity : chr NA NA NA NA ... ## $ Entity of Record Reg Number: int NA NA NA NA NA NA NA NA NA NA ... ## $ Entity of Record Name : chr NA NA NA NA ... ## $ Address : chr "13460 SE RUSK RD" "13460 SE RUSK RD" "241 NW HOWARD LN" "241 NW HOWARD LN" ... ## $ Address Continued : chr NA NA NA NA ... ## $ City : chr "MILWAUKIE" "MILWAUKIE" "DALLAS" "DALLAS" ... ## $ State : chr "OR" "OR" "OR" "OR" ... ## $ Zip Code : chr "97222" "97222" "97338" "97338" ... ## - attr(*, "spec")=List of 2 ## ..$ cols :List of 17 ## .. ..$ Registry Number : list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ Business Name : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Entity Type : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Registry Date : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Associated Name Type : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ First Name : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Middle Name : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Last Name : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Suffix : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Not of Record Entity : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Entity of Record Reg Number: list() ## .. .. ..- attr(*, "class")= chr "collector_integer" "collector" ## .. ..$ Entity of Record Name : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Address : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Address Continued : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ City : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ State : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## .. ..$ Zip Code : list() ## .. .. ..- attr(*, "class")= chr "collector_character" "collector" ## ..$ default: list() ## .. ..- attr(*, "class")= chr "collector_guess" "collector" ## ..- attr(*, "class")= chr "col_spec"
glimpse(biz)
## Observations: 19,674 ## Variables: 17 ## $ `Registry Number` <int> 139275697, 139275697, 139253991,... ## $ `Business Name` <chr> "A PIECE OF HEAVEN ADULT CARE HO... ## $ `Entity Type` <chr> "ASSUMED BUSINESS NAME", "ASSUME... ## $ `Registry Date` <chr> "01/02/2018", "01/02/2018", "01/... ## $ `Associated Name Type` <chr> "AUTHORIZED REPRESENTATIVE", "PR... ## $ `First Name` <chr> "OLIMPIA", NA, NA, NA, "ANTHONY"... ## $ `Middle Name` <chr> "V", NA, NA, NA, NA, NA, "A", NA... ## $ `Last Name` <chr> "URSU", NA, NA, NA, "RE", NA, "G... ## $ Suffix <chr> NA, NA, NA, NA, NA, NA, NA, NA, ... ## $ `Not of Record Entity` <chr> NA, NA, NA, NA, NA, NA, NA, NA, ... ## $ `Entity of Record Reg Number` <int> NA, NA, NA, NA, NA, NA, NA, NA, ... ## $ `Entity of Record Name` <chr> NA, NA, NA, NA, NA, NA, NA, NA, ... ## $ Address <chr> "13460 SE RUSK RD", "13460 SE RU... ## $ `Address Continued` <chr> NA, NA, NA, NA, NA, NA, NA, NA, ... ## $ City <chr> "MILWAUKIE", "MILWAUKIE", "DALLA... ## $ State <chr> "OR", "OR", "OR", "OR", "OR", "O... ## $ `Zip Code` <chr> "97222", "97222", "97338", "9733...
Q1: What is the trend in new business licenses over the last month?
Are these distinct businesses?
biz %>% head(3)
## # A tibble: 3 x 17 ## `Registry Number` `Business Name` `Entity Type` `Registry Date` ## <int> <chr> <chr> <chr> ## 1 139275697 A PIECE OF HEAVEN … ASSUMED BUSINESS … 01/02/2018 ## 2 139275697 A PIECE OF HEAVEN … ASSUMED BUSINESS … 01/02/2018 ## 3 139253991 A. RE PLUMBING LLC DOMESTIC LIMITED … 01/02/2018 ## # ... with 13 more variables: `Associated Name Type` <chr>, `First ## # Name` <chr>, `Middle Name` <chr>, `Last Name` <chr>, Suffix <chr>, ## # `Not of Record Entity` <chr>, `Entity of Record Reg Number` <int>, ## # `Entity of Record Name` <chr>, Address <chr>, `Address ## # Continued` <chr>, City <chr>, State <chr>, `Zip Code` <chr>
Q1: What is the trend in new business licenses over the last month?
biz <- biz %>%
distinct(`Business Name`, .keep_all = TRUE)
biz
## # A tibble: 7,291 x 17 ## `Registry Number` `Business Name` `Entity Type` `Registry Date` ## <int> <chr> <chr> <chr> ## 1 139275697 A PIECE OF HEAVEN … ASSUMED BUSINESS… 01/02/2018 ## 2 139253991 A. RE PLUMBING LLC DOMESTIC LIMITED… 01/02/2018 ## 3 139277594 ADVANCED CANDY AND… DOMESTIC BUSINES… 01/02/2018 ## 4 139255491 ALEJANDRAS LAWN SE… DOMESTIC LIMITED… 01/02/2018 ## 5 139145395 ANIMATION SEO INC. DOMESTIC BUSINES… 01/02/2018 ## 6 139232797 BEAUTIFUL BLINDS A… DOMESTIC LIMITED… 01/02/2018 ## 7 139253090 BETHRAL LLC DOMESTIC LIMITED… 01/02/2018 ## 8 139246599 BOND CONSTRUCTION … DOMESTIC LIMITED… 01/02/2018 ## 9 139257794 D&C GOODS LLC DOMESTIC LIMITED… 01/02/2018 ## 10 139276190 D&D AFFORDABLE TIR… DOMESTIC LIMITED… 01/02/2018 ## # ... with 7,281 more rows, and 13 more variables: `Associated Name ## # Type` <chr>, `First Name` <chr>, `Middle Name` <chr>, `Last ## # Name` <chr>, Suffix <chr>, `Not of Record Entity` <chr>, `Entity of ## # Record Reg Number` <int>, `Entity of Record Name` <chr>, ## # Address <chr>, `Address Continued` <chr>, City <chr>, State <chr>, ## # `Zip Code` <chr>
Q1: What is the trend in new business licenses over the last month?
Here is our key column:
class(biz$`Registry Date`)
## [1] "character"
But we'd like a data frame that can do this:
ggplot(biz, aes(x = date, y = count)) + geom_line()
integernumericlogicalcharacterfactorA package to represent datetime data, do operations on it, and output it in various formats.
You can create <date> data using a variety of functions tailored to the format of the character string.
library(lubridate)
ymd("2018-02-22")
## [1] "2018-02-22"
mdy("February 22nd, 2018")
## [1] "2018-02-22"
dmy("22-Feb-2018")
## [1] "2018-02-22"
You can create <dttm> (date time) data by extending the same syntax.
mdy_hm("02/22/2018 06:26")
## [1] "2018-02-22 06:26:00 UTC"
You can also cobble together a <dttm> from across multiple columns.
flights %>% select(year, month, day, hour, minute) %>% mutate(departure = make_datetime(year, month, day, hour, minute))
Once you have data represented as a date-time, it's easy to pull out components that you're interested in.
now <- now() now
## [1] "2018-02-22 14:02:50 PST"
day(now)
## [1] 22
hour(now)
## [1] 14
minute(now)
## [1] 2
second(now)
## [1] 50.72688
wday(now)
## [1] 5
wday(now, label = TRUE)
## [1] Thu ## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
wday(now, label = TRUE, abbr = FALSE)
## [1] Thursday ## 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
joy <- mdy(04272018) - today() class(joy)
## [1] "difftime"
joy
## Time difference of 64 days
joy <- as.duration(joy) joy
## [1] "5529600s (~9.14 weeks)"
Use pnwflights14 to answer the following questions.
d1 <- "January 1, 2010"
d2 <- "2015-Mar-07"
d3 <- "06-Jun-2017"
d4 <- c("August 19 (2015)", "July 1 (2015)")
d5 <- "12/30/14" # Dec 30, 2014
pnwflights14 flights data, on what day of the week should you leave PDX if you want to minimize the chance of a delay?d <- dmy(biz$`Registry Date`)
## Warning: 4364 failed to parse.
?dmy
d <- mdy(biz$`Registry Date`) class(d)
## [1] "Date"
head(d)
## [1] "2018-01-02" "2018-01-02" "2018-01-02" "2018-01-02" "2018-01-02" ## [6] "2018-01-02"
Q1: What is the trend in new business licenses over the last month?
First a barchart.
biz <- biz %>% mutate(registry_date = mdy(`Registry Date`)) ggplot(biz, aes(x = registry_date)) + geom_bar()
Q1: What is the trend in new business licenses over the last month?
Let's try a line chart
biz %>% group_by(registry_date) %>% summarize(count = n()) %>% ggplot(aes(x = registry_date, y = count)) + geom_line()
Q2: What is the weekly cycle in new business licenses?
library(rvest)
url <- "https://www.nytimes.com/interactive/2018/sports/olympics/medal-count-results-schedule.html?smid=tw-nytimes&smtyp=cur"
tables <- url %>%
read_html() %>%
html_nodes("table")
length(tables)
tab1 <- html_table(tables[[1]])
tab1 <- html_table(tables[[1]], header = TRUE)
tab1_sep <- tab1 %>%
separate("Medal Count",
into = c("country","country_code"),
sep = -3)
olympics <- tab1_sep %>%
gather(Gold:Bronze, key = "medal", value = "count") %>%
uncount(count)
olympics %>%
ggplot(aes(x = reorder(country))) +
geom_bar() +
coord_flip()
olympics %>%
ggplot(aes(x = country)) +
geom_bar() +
coord_flip()
str(olympics)
olympics <- olympics %>%
mutate(country = factor(country))
str(olympics)
levels(olympics$country)
olympics <- olympics %>%
mutate(country = reorder(country, Total))
levels(olympics$country)
olympics %>%
ggplot(aes(x = country)) +
geom_bar() +
coord_flip()
olympics %>%
ggplot(aes(x = country, fill = medal)) +
geom_bar() +
coord_flip()
olympics <- olympics %>%
mutate(medal = factor(medal, levels = c("Gold", "Silver", "Bronze")))
olympics %>%
ggplot(aes(x = country, fill = medal)) +
geom_bar() +
coord_flip() +
scale_fill_manual(values = cols)
cols <- c("Gold" = "#CFB53B", "Silver" = "#E6E8FA", "Bronze" = "#8C7853")
olympics %>%
ggplot(aes(x = country, fill = medal)) +
geom_bar() +
coord_flip() +
scale_fill_manual(values = cols)
olympics %>%
ggplot(aes(x = country, fill = medal)) +
geom_bar() +
coord_flip() +
scale_fill_manual(values = cols) +
theme_bw()
as_date(now())
## [1] "2018-02-22"